In [58]:
'''U.S. SAMPLE'''
In [96]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('bmh')

os.chdir('/Users/lockiemichalski/Documents/Files_Mac_before_clean/Desktop/UQ/RQ3_2020/Credit_rating_WRDS/Steps_sample/Files to upload rand/US_data/Draft2_data')
In [97]:
data = pd.read_csv('US_SAMPLE_w_ESG.csv').iloc[:,1:]
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (5,210,248,289,295,398,399,400,401,402,403,460,461) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
In [98]:
data.sort_values(by='rating_rank', inplace=True)
In [100]:
print(str(len(data)) + ' observations')
print(str(len(data['Instrument'].unique())) + ' unique firms')
85553 observations
1858 unique firms
In [62]:
"WITHOUT FINANCIALS"

plt.figure(figsize = (20, 10), dpi = 200)
sns.countplot(x='Issuer Rating', data=data).set_title('All-classes')
# class C is not in graph as there are only 2 observations and for classifiers to run this has to removed. 
Out[62]:
Text(0.5, 1.0, 'All-classes')
In [90]:
"Function to plot the missing values"

def missing_value_plot(df,title):
    null_counts = df.isnull().sum()/len(df)
    null_counts = null_counts.sort_values()
    plt.figure(figsize=(16,8))
    plt.xticks(np.arange(len(null_counts))+0.5,null_counts.index,rotation='vertical')
    plt.ylabel('fraction of rows with missing data (%)')
    plt.bar(np.arange(len(null_counts)),null_counts)
    plt.title(title)
    plt.xlabel('Features')
In [91]:
financial_features = ['accrualq', 'aftret_eqq', 'aftret_equityq', 'aftret_invcapxq', 'at_turnq', 
                      'bmq', 'capeiq', 'capital_ratioq', 'cash_conversionq', 'cash_debtq', 'cash_ltq',
                      'cash_ratioq', 'cfmq', 'curr_debtq', 'curr_ratioq', 'debt_atq', 'de_ratioq', 
                      'debt_capitalq', 'debt_ebitdaq', 'debt_invcapq', 'divyieldq', 'dltt_beq', 
                      'dprq', 'efftaxq', 'equity_invcapq', 'evmq', 'fcf_ocfq', 'gpmq', 'GProfq', 
                      'int_debtq', 'int_totdebtq', 'intcov_ratioq', 'intcovq', 'inv_turnq', 
                      'invt_actq', 'lt_atq', 'lt_debtq', 'lt_ppentq', 'mktcap', 'npmq', 'ocf_lctq', 
                      'opmadq', 'opmbdq', 'pay_turnq', 'pcfq', 'pe_exiq', 'pe_incq', 'pe_op_basicq',
                      'pe_op_dilq', 'PEG_1yrforward', 'PEG_ltgforward', 'PEG_trailing', 'pretret_earnatq',
                      'pretret_noq', 'price_adj', 'price_unadj', 'profit_lctq', 'psq', 'ptbq', 'ptpmq',
                      'quick_ratioq', 'rd_saleq', 'rect_actq', 'rect_turnq', 'roaq', 'roceq', 'roeq',
                      'sale_equityq', 'sale_invcapq', 'sale_nwcq', 'short_debtq', 'totdebt_invcapq',
                      'gind', 'ggroup', 'gsector']
missing_value_plot(data[financial_features], 'Financial ratios - without gsector 40') # financial ratios missing data + 3 categorical industry 
In [92]:
"""Financial ratios: divyield should be filled with 0 as these would be firms with no dividend, 
therefore median not correct to use. This is probable for some of the debt ratios too as some firms might not actually 
have any debt leading to such high missing values. rd_saleq needs to go. Similarly, PE ratio types exhibit
lots of missing data"""
categorical_ind = ['gind', 'ggroup', 'gsector']
In [93]:
missing_value_plot(data[['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
 'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']], 'Analyst recommendations')

'''NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the 
firm having any recommendations. What should we do about this?'''
Out[93]:
'NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the \nfirm having any recommendations. What should we do about this?'
In [94]:
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
 'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
missing_value_plot(data[categorical_inst], "Inst holdings")
In [95]:
macro_features = ['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit', 
                          'consumer_confidence', 'consumer_spending', 'current_account_balance', 
                          'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance', 
                          'government_bond', 'government_external_debt', 'government_spending', 
                          'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop', 
                          'imports_goods_services', 'industrial_production', 'international_reserves',
                          'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0', 
                          'money_supply_M1', 'money_supply_M2', 'overnight_rate', 'population', 
                          'retail_sales', 'sov_outlook_rank','stock_index', 'terms_of_trade', 
                          'unemployment_rate']
missing_value_plot(data[macro_features], 'Macro + Macro growth')
In [66]:
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
 'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
categorical_macro = ['sov_rating_rank','sov_outlook_rank']
categorical_analyst = ['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
 'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']
categorical_ind = ['gind', 'ggroup', 'gsector']
In [67]:
categorical = categorical_ind + categorical_inst + categorical_analyst + categorical_macro
In [68]:
'''Box plot of inst categorical features'''

fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_inst, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [69]:
'''Box plot of analyst categorical features'''

fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_analyst, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [70]:
'''Box plot of industry categorical features'''

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for var, subplot in zip(categorical_ind, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [71]:
'''Box plot of financial ratio features - Capitalization'''

fin_capitalization=['capital_ratioq','equity_invcapq','debt_invcapq','totdebt_invcapq']

fig, ax = plt.subplots(2,2, figsize=(20, 10))
for var, subplot in zip(fin_capitalization, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
    
In [75]:
'''Box plot of financial ratio features - Efficiency'''

fin_efficiency=['at_turnq','inv_turnq','pay_turnq','rect_turnq','sale_equityq','sale_invcapq','sale_nwcq']

fig, ax = plt.subplots(4,2, figsize=(30, 30))
for var, subplot in zip(fin_efficiency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [76]:
'''Box plot of financial ratio features - Financial Soundness'''

fin_finsoundness=['invt_actq','rect_actq','fcf_ocfq','ocf_lctq','cash_debtq',
                'cash_ltq','cfmq','short_debtq','profit_lctq',
                'curr_debtq','debt_ebitdaq','dltt_beq','int_debtq',
                'int_totdebtq','lt_debtq','lt_ppentq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_finsoundness, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [77]:
'''Box plot of financial ratio features - Liquidity'''

fin_liquidity=['cash_ratioq','curr_ratioq','quick_ratioq','accrualq','rd_saleq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_liquidity, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [78]:
'''Box plot of financial ratio features - Profitability''' #NO FINANCIALS

fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [79]:
'''Box plot of financial ratio features - Profitability'''

fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [80]:
'''Box plot of financial ratio features - Solvency'''

fin_solvency = ['de_ratioq','debt_atq','lt_atq','debt_capitalq','intcovq','intcov_ratioq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_solvency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [81]:
'''Box plot of financial ratio features - Valuation'''

fin_valuation = ['dprq','PEG_1yrforward','PEG_ltgforward','PEG_trailing','bmq','capeiq','divyieldq',
 'evmq','pcfq','pe_exiq','pe_incq','pe_op_basicq','pe_op_dilq','psq','ptbq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_valuation, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [82]:
'''Box plot of financial ratio features - other'''

fin = ['mktcap','price_adj']

fig, ax = plt.subplots(1,3, figsize=(30, 10))
for var, subplot in zip(fin, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [87]:
'''Macro and macro_growth features'''
macro_features_growth = [col for col in data if col.endswith('%')]
In [88]:
'''Box plot of numeric macro features'''

fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
    
In [97]:
'''Box plot of numeric macro growth features'''

fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features_growth, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)